In [3]:
# standard
from IPython import embed
import pandas as pd
import numpy as np
# frameworks
from frameworks.seq2seq_keras.models import AttentionSeq2Seq
from gensim.models import Word2Vec
# custom
from data_utils import get_train_data
from word2vec import get_word_embedding
from vocab import get_vocab
In [7]:
_BATCH_SIZE = 64
_VOCAB_SIZE = 6000
_WORD_DIM = 128
_MODEL_DEPTH = 4
_INPUT_LENGTH = 25
_OUTPUT_LENGTH = 10
In [3]:
model = AttentionSeq2Seq(input_length=_INPUT_LENGTH,
                         input_dim=_WORD_DIM,
                         hidden_dim=_WORD_DIM,
                         output_length=_OUTPUT_LENGTH,
                         output_dim=_WORD_DIM,
                         depth=_MODEL_DEPTH)
model.compile(loss='mse', optimizer='rmsprop')
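The model regresses 128-dimensional word vectors directly, hence the mse loss. A quick shape sanity check (a sketch; it assumes AttentionSeq2Seq behaves like a standard Keras model mapping (batch, input_length, input_dim) to (batch, output_length, output_dim)):
In [ ]:
# feed a dummy batch through the untrained model and confirm the output shape
dummy = np.zeros((2, _INPUT_LENGTH, _WORD_DIM), dtype=np.float32)
print(model.predict(dummy).shape)  # expected: (2, 10, 128)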
In [8]:
embedding = get_word_embedding(_WORD_DIM)
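get_word_embedding is assumed here to return an array-like table of shape (_VOCAB_SIZE, _WORD_DIM), indexed by the character ids that ch2int produces below (an assumption about the custom word2vec module, not verified):
In [ ]:
# hypothetical shape check for the embedding table
print(np.asarray(embedding).shape)  # expected: (6000, 128)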
In [7]:
train_data = get_train_data()
_, ch2int = get_vocab()
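Each record in train_data carries a 'keyword' and a 'sentence' field (the field names are taken from clean_train_data below). A quick peek at one record:
In [ ]:
# inspect one training record
print(train_data[0]['keyword'], train_data[0]['sentence'])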
In [8]:
len(train_data)
Out[8]:
In [9]:
def pad_to(lst, length, value):
    """Pad lst in place with value until it reaches length."""
    for _ in range(len(lst), length):
        lst.append(value)
    return lst

def clean_train_data(train_data):
    X_train = []
    Y_train = []
    for idx in range(len(train_data)):
        # poems are stored four sentences at a time; line_number is the
        # position of the current sentence within its poem
        line_number = idx % 4
        keyword = train_data[idx]['keyword']
        current_sentence = train_data[idx]['sentence']
        # earlier sentences of the same poem serve as context
        previous_sentences = ''.join([train_data[idx - i]['sentence'] for i in range(line_number, 0, -1)])
        # encode characters as vocab indices, padding with the last vocab index
        X_entry = pad_to([[ch2int[ch]] for ch in (keyword + previous_sentences)], _INPUT_LENGTH, [_VOCAB_SIZE - 1])
        Y_entry = pad_to([[ch2int[ch]] for ch in current_sentence], _OUTPUT_LENGTH, [_VOCAB_SIZE - 1])
        X_train.append(X_entry)
        Y_train.append(Y_entry)
    return X_train, Y_train
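pad_to mutates the list in place and returns it, e.g. (hypothetical index values; _VOCAB_SIZE - 1 = 5999 is the pad index used above):
In [ ]:
print(pad_to([[5], [7]], 4, [_VOCAB_SIZE - 1]))  # [[5], [7], [5999], [5999]]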
In [10]:
X_train, Y_train = clean_train_data(train_data)
In [13]:
# replace each vocab index with its embedding vector
X_train_embedded = [[embedding[x[0]] for x in sample] for sample in X_train]
In [14]:
Y_train_embedded = [[embedding[x[0]] for x in sample] for sample in Y_train]
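Keras expects numpy arrays rather than nested lists, so the training data is converted before fitting; a quick shape check (assuming the padded lengths above):
In [ ]:
print(np.array(X_train_embedded).shape)  # expected: (N, 25, 128), N = len(train_data)
print(np.array(Y_train_embedded).shape)  # expected: (N, 10, 128)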
In [15]:
model.fit(np.array(X_train_embedded), np.array(Y_train_embedded), epochs=1, verbose=1)
Out[15]:
In [16]:
kw = u'山水'  # keyword: "mountains and water", i.e. landscape
In [17]:
kw_pad = [pad_to([[ch2int[ch]] for ch in kw], _INPUT_LENGTH, [_VOCAB_SIZE - 1])]
In [18]:
kw_embed = [[embedding[x[0]] for x in sample] for sample in kw_pad]
In [19]:
kw_embed_array = np.array(kw_embed)
In [20]:
pred = model.predict(kw_embed_array)
pred
Out[20]:
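pred holds one embedding vector per output position, so its shape should be (1, _OUTPUT_LENGTH, _WORD_DIM):
In [ ]:
print(pred.shape)  # expected: (1, 10, 128)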
In [21]:
w2v_model = Word2Vec.load('data/word2vec.model')
In [22]:
result = []
for i in range(len(pred[0])):
    # nearest vocabulary word to each predicted embedding vector
    result.append(w2v_model.wv.most_similar(positive=[pred[0][i]], topn=1))
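Equivalently, the predicted vectors can be decoded and joined in one pass (same calls as above, written as a comprehension):
In [ ]:
# top-1 nearest neighbor per predicted vector, joined into a line of verse
print(''.join(w2v_model.wv.most_similar(positive=[vec], topn=1)[0][0] for vec in pred[0]))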
In [4]:
for r in result:
    print(r[0][0])
In [ ]: